/*
//              INTEL CORPORATION PROPRIETARY INFORMATION
//  This software is supplied under the terms of a license  agreement or
//  nondisclosure agreement with Intel Corporation and may not be copied
//  or disclosed except in  accordance  with the terms of that agreement.
//    Copyright (c) 2007 Intel Corporation. All Rights Reserved.
//
*/
#include "umc_defs.h"
#if defined (UMC_ENABLE_DVHD_VIDEO_ENCODER)

#include "umc_dv100_enc_segment_compressor.h"
#include "umc_dv_enc_block.h"
#include "umc_dv_enc_huffman.h"
#include <ippi.h>
#include <ippvc.h>


namespace UMC
{

// Candidate quantizer settings tried by DoQuantizationAndWeighting().
// Rows are listed in order of increasing QuantStep, so a larger table index
// means a coarser quantizer. Per the column header below, each row holds:
//   QuantStep - nominal quantization step of the row
//   QNO       - value written to staqno of every encoded macro block
//   ClassNum  - value written to m_cC1C0 of every DCT block
// QNO and ClassNum together select the step that is actually applied, via
// QuantizationSteps[QNO*4 + ClassNum].
// NOTE(review): the QuantStep column is not read anywhere in this file; it is
// presumably expected to equal QuantizationSteps[QNO*4 + ClassNum] - confirm.
const SQuantTableElem DV100SegmentCompressor :: QuantizeTable[] =
{
    //QuantStep QNO ClassNum
    {1,         1,  0},
    {2,         2,  0},
    {3,         3,  0},
    {4,         4,  0},
    {5,         5,  0},
    {6,         6,  0},
    {7,         7,  0},
    {8,         8,  0},
    {10,        5,  1},
    {12,        6,  1},
    {14,        7,  1},
    {16,        9,  0},
    {18,       10,  0},
    {20,       11,  0},
    {22,       12,  0},
    {24,       13,  0},
    {28,       14,  0},
    {32,        9,  1},
    {36,       10,  1},
    {40,       11,  1},
    {44,       12,  1},
    {48,       13,  1},
    {52,       15,  0},
    {56,       14,  1},
    {64,        9,  2},
    {72,       10,  2},
    {80,       11,  2},
    {88,       12,  2},
    {96,       13,  2},
    {104,      15,  1},
    {112,      14,  2}
};

// The quantizer search of the first segment starts at QuantizeTable entry 6;
// afterwards DoQuantizationAndWeighting() keeps the predictor up to date.
DV100SegmentCompressor::DV100SegmentCompressor()
    : m_iQuantPredictor(6)
{
}

// Nothing to release here: this file acquires no resources of its own.
DV100SegmentCompressor::~DV100SegmentCompressor()
{
}

// Compresses one DV100 video segment.
// The stages must run in exactly this order, each one consuming the output of
// the previous one:
//   1. reset the output bit stream
//   2. forward DCT (frame or field mode per macro block) plus weighting
//   3. conversion of every DCT block into a run-length list
//   4. quantizer selection and quantization of the run-length lists
//   5. Huffman coding into lpDVSegment
void DV100SegmentCompressor::CompressSegment(struct DV_SEGMENT *lpDVSegment)
{
    ResetEncodeBitStream();
    DoDCT();
    ZigZagDCTBlocks();
    DoQuantizationAndWeighting(lpDVSegment);
    EncodeHuffman(lpDVSegment);
}

// Converts one 8x8 coefficient block into a packed run-length list.
//   RLmem  - in/out cursor into the run-length buffer; on return it points
//            just past the terminator that was written
//   lpsDst - the 64 coefficients of the block (only read here)
// Words written at *RLmem:
//   first word : DC coefficient divided by 4
//   then one word per non-zero AC coefficient, visited in the scan order
//   given by DeZigzagIndex0:
//       bits 16..31  absolute value of the coefficient
//       bit  15      sign (set for a negative coefficient)
//       bits  8..13  scan position (1..63)
//       bits  0..7   count of zero coefficients skipped since the previous word
//   last word  : 0x4000 terminator
inline
static
void ZigZagBlock(Ipp32u** RLmem, Ipp16s *lpsDst)
{
    Ipp32u *pOut = *RLmem;
    Ipp32u nextRunStart = 1; // scan position at which the current zero run began
    Ipp32u scanPos;

    *pOut++ = (lpsDst[0] / 4);

    for (scanPos = 1; scanPos < 64; scanPos++)
    {
        Ipp32s coeff = lpsDst[DeZigzagIndex0[scanPos]];
        if (0 == coeff)
            continue;

        Ipp32u packed = (coeff & 0x8000) | (scanPos << 8) | (scanPos - nextRunStart);
        packed |= (abs(coeff) << 16);
        *pOut++ = packed;

        nextRunStart = scanPos + 1;
    }

    *pOut++ = 0x4000; //0x4000 == 64<<8
    *RLmem = pOut;
} //void ZigZagBlock(...)

void DV100SegmentCompressor::ZigZagDCTBlocks()
{
    Ipp32u *RLmemberSeg = m_VSegment.m_pDCTBlocks[0].m_lpsDataRL;
    Ipp32s DCTBlockNum;
    for(DCTBlockNum=0; DCTBlockNum<8*5; DCTBlockNum++)
    {
        BLOCK *lpBlocks = m_VSegment.m_pDCTBlocks + DCTBlockNum;
        lpBlocks->m_lpsDataRL = RLmemberSeg;
        ZigZagBlock(&RLmemberSeg, lpBlocks->m_lpsData);
    }
}

void DV100SegmentCompressor::DoDCT()
{
    Ipp16s FieldProcessedDCTBlocks[64*8];
    Ipp32s DCTBlockNum, MBNum, y;
    Ipp32u TotalFrameDCTZeroes, TotalFieldDCTZeroes, CurrZeroes;

    for(MBNum=0; MBNum<5; MBNum++)
    {
        //Rearrange pixeles in vertically adjacent DCT blocks
        for(DCTBlockNum=0; DCTBlockNum<4; DCTBlockNum++)
        {
            Ipp16s *pUpperFieldDCTBlock, *pLowerFieldDCTBlock, *pUpperSrcBlock, *pLowerSrcBlock;
            if(DCTBlockNum < 2)
            {
                pUpperSrcBlock = m_VSegment.m_pDCTBlocks[8*MBNum + DCTBlockNum].m_lpsData;
                pLowerSrcBlock = m_VSegment.m_pDCTBlocks[8*MBNum + DCTBlockNum+2].m_lpsData;
                pUpperFieldDCTBlock = FieldProcessedDCTBlocks + DCTBlockNum*64;
                pLowerFieldDCTBlock = pUpperFieldDCTBlock + 2*64;
            }
            else
            {
                pUpperSrcBlock = m_VSegment.m_pDCTBlocks[8*MBNum + 2*DCTBlockNum].m_lpsData;
                pLowerSrcBlock = m_VSegment.m_pDCTBlocks[8*MBNum + 2*DCTBlockNum+1].m_lpsData;
                pUpperFieldDCTBlock = FieldProcessedDCTBlocks + 2*DCTBlockNum*64;
                pLowerFieldDCTBlock = pUpperFieldDCTBlock + 64;
            }

            for(y=0; y<4; y++)
            {
                memcpy(pUpperFieldDCTBlock+y*8, pUpperSrcBlock+y*2 * 8, 8*sizeof(Ipp16s));
                memcpy(pLowerFieldDCTBlock+y*8, pUpperSrcBlock+(y*2 + 1) * 8, 8*sizeof(Ipp16s));

                memcpy(pUpperFieldDCTBlock+(y+4)*8, pLowerSrcBlock+y*2 * 8, 8*sizeof(Ipp16s));
                memcpy(pLowerFieldDCTBlock+(y+4)*8, pLowerSrcBlock+(y*2 + 1) * 8, 8*sizeof(Ipp16s));
            }
        }

        TotalFrameDCTZeroes = 0;
        TotalFieldDCTZeroes = 0;
        for(DCTBlockNum=0; DCTBlockNum<8; DCTBlockNum++)
        {
            BLOCK *lpBlock = m_VSegment.m_pDCTBlocks + MBNum*8 + DCTBlockNum;
            ippiDCT8x8Fwd_16s_C1I(lpBlock->m_lpsData);
            Ipp16s *currDCTBlock = lpBlock->m_lpsData;
            {
                //DC not weighted!!!
                //Scaling DC
                //currDCTBlock[0] <<=5;
                //Perform weighting
                //currDCTBlock[0] >>= 7;
                const Ipp32u *pQuantizeMatrix;
                if ((m_VideoFormat == mSystem720_60p) || (m_VideoFormat == mSystem720_50p))
                    pQuantizeMatrix = (DCTBlockNum < 4) ? LumaQuantizeMatrix_720System : ChromaQuantizeMatrix_720System;
                else
                    pQuantizeMatrix = (DCTBlockNum < 4) ? LumaQuantizeMatrix_1080System : ChromaQuantizeMatrix_1080System;
                for (int i = 1; i < 64; i++)
                {
                    currDCTBlock[i] <<=5;
                    Ipp32s temp = (Ipp32s) (((Ipp32s) currDCTBlock[i])
                                            / ((Ipp32s)pQuantizeMatrix[i]));
                    currDCTBlock[i] = (Ipp16s) temp;
                }
            }
            lpBlock->m_cM0 = 0;
            ippiCountZeros8x8_16s_C1(lpBlock->m_lpsData, &CurrZeroes);
            TotalFrameDCTZeroes += CurrZeroes;

            ippiDCT8x8Fwd_16s_C1I(FieldProcessedDCTBlocks + DCTBlockNum*64);
            currDCTBlock = (FieldProcessedDCTBlocks + DCTBlockNum*64);
            {
                const Ipp32u *pQuantizeMatrix;
                if ((m_VideoFormat == mSystem720_60p) || (m_VideoFormat == mSystem720_50p))
                    pQuantizeMatrix = (DCTBlockNum < 4) ? LumaQuantizeMatrix_720System : ChromaQuantizeMatrix_720System;
                else
                    pQuantizeMatrix = (DCTBlockNum < 4) ? LumaQuantizeMatrix_1080System : ChromaQuantizeMatrix_1080System;
                for (int i = 1; i < 64; i++)
                {
                    currDCTBlock[i] <<=5;
                    Ipp32s temp = (Ipp32s) (((Ipp32s) currDCTBlock[i])
                                            / ((Ipp32s)pQuantizeMatrix[i]));
                    currDCTBlock[i] = (Ipp16s) temp;
                }
            }

            ippiCountZeros8x8_16s_C1(FieldProcessedDCTBlocks + DCTBlockNum*64, &CurrZeroes);
            TotalFieldDCTZeroes += CurrZeroes;
        }

        if(TotalFieldDCTZeroes > TotalFrameDCTZeroes)
        {
            for(DCTBlockNum=0; DCTBlockNum<8; DCTBlockNum++)
            {
                BLOCK *lpBlock = m_VSegment.m_pDCTBlocks + MBNum*8 + DCTBlockNum;
                memcpy(lpBlock->m_lpsData, FieldProcessedDCTBlocks + DCTBlockNum*64, 64*sizeof(Ipp16s));
                lpBlock->m_cM0 = 1;
            }
        }
    }
}

// Bit budget of one video segment. DoQuantizationAndWeighting() compares the
// estimated segment size against it when choosing a quantizer, and
// GetEncodedQuantizedDCTBlockBitLength() returns it to flag a block that
// cannot be coded with the quantizer under test.
enum
{
    DV100_SEGMENT_LIMIT   = 76*8*5 //76*8 bits for each of 5 macro blocks
};


// Estimates the number of bits the whole segment (5 macro blocks of 8 DCT
// blocks) would occupy if quantized with the given class number and QNO.
// Returns the sum of the per-block estimates produced by
// GetEncodedQuantizedDCTBlockBitLength().
Ipp32s DV100SegmentCompressor::ComputeSegmentSize(Ipp32s ClassNum, Ipp32s QNO)
{
    // The matrices depend only on the video system, so select them up front.
    const Ipp32u *pFirstFourMatrix;
    const Ipp32u *pLastFourMatrix;
    if ((m_VideoFormat == mSystem720_60p) || (m_VideoFormat == mSystem720_50p))
    {
        pFirstFourMatrix = LumaQuantizeMatrix_720System;
        pLastFourMatrix  = ChromaQuantizeMatrix_720System;
    }
    else
    {
        pFirstFourMatrix = LumaQuantizeMatrix_1080System;
        pLastFourMatrix  = ChromaQuantizeMatrix_1080System;
    }

    Ipp32s totalBits = 0;
    for (Ipp32s blockIdx = 0; blockIdx < 5*8; blockIdx++)
    {
        // The first four blocks of every macro block use the luma matrix
        const Ipp32u *pMatrix = ((blockIdx & 7) < 4) ? pFirstFourMatrix : pLastFourMatrix;
        totalBits += GetEncodedQuantizedDCTBlockBitLength(m_VSegment.m_pDCTBlocks + blockIdx, ClassNum, QNO, pMatrix);
    }

    return totalBits;
}



void DV100SegmentCompressor::DoQuantizationAndWeighting(struct DV_SEGMENT *lpDVSegment)
{
    Ipp32s nDCTBlockNum, MBNum;
    const Ipp32u *pQuantizeMatrix;

    Ipp32s QuantNum = m_iQuantPredictor;

    if( ComputeSegmentSize(QuantizeTable[QuantNum].ClassNum, QuantizeTable[QuantNum].QNO) >= DV100_SEGMENT_LIMIT )
    {
        if( QuantNum + 1 < (sizeof(QuantizeTable)/sizeof(QuantizeTable[0])) )
        {
            do
            {
                QuantNum++;
            }
            while ( ( QuantNum + 1 < (sizeof(QuantizeTable)/sizeof(QuantizeTable[0])) ) &&
                    (ComputeSegmentSize(QuantizeTable[QuantNum].ClassNum, QuantizeTable[QuantNum].QNO) >= DV100_SEGMENT_LIMIT) );
        }
    }
    else
    {
        while ((QuantNum > 0) && (ComputeSegmentSize(QuantizeTable[QuantNum-1].ClassNum, QuantizeTable[QuantNum-1].QNO) < DV100_SEGMENT_LIMIT) )
        {
           QuantNum--;
        }
    }

    lpDVSegment->m_EncodedMacroBlocks[0].staqno =
    lpDVSegment->m_EncodedMacroBlocks[1].staqno =
    lpDVSegment->m_EncodedMacroBlocks[2].staqno =
    lpDVSegment->m_EncodedMacroBlocks[3].staqno =
    lpDVSegment->m_EncodedMacroBlocks[4].staqno = (Ipp8u)QuantizeTable[QuantNum].QNO;

    for(MBNum=0; MBNum<5; MBNum++)
    {
        for(nDCTBlockNum = 0; nDCTBlockNum < 8; nDCTBlockNum++)
        {
            if ((m_VideoFormat == mSystem720_60p)|| (m_VideoFormat == mSystem720_50p))
                pQuantizeMatrix = (nDCTBlockNum < 4) ? LumaQuantizeMatrix_720System : ChromaQuantizeMatrix_720System;
            else
                pQuantizeMatrix = (nDCTBlockNum < 4) ? LumaQuantizeMatrix_1080System : ChromaQuantizeMatrix_1080System;


            BLOCK *pCurrBlock = m_VSegment.m_pDCTBlocks + MBNum*8  + nDCTBlockNum;
            pCurrBlock->m_cC1C0 = (Ipp8u)QuantizeTable[QuantNum].ClassNum;

            QuantizeDCTBlock(pCurrBlock, QuantizeTable[QuantNum].ClassNum, QuantizeTable[QuantNum].QNO, pQuantizeMatrix);
        }//for nDCTBlockNum = 0 to 7
    }
   m_iQuantPredictor  = QuantNum;
}


// Estimates how many bits one DCT block takes once quantized and VLC coded.
//   pDCTBlock      - block whose run-length list (m_lpsDataRL) is examined;
//                    the list is only read
//   ClassNum, QNO  - select the step as QuantizationSteps[QNO*4 + ClassNum]
//   QuantizeMatrix - weighting matrix of the block (looked up but currently
//                    not applied, see the note inside the loop)
// Returns the estimated bit count, or DV100_SEGMENT_LIMIT when some quantized
// amplitude exceeds 255, which makes the caller reject this quantizer.
Ipp32s DV100SegmentCompressor::GetEncodedQuantizedDCTBlockBitLength(BLOCK *pDCTBlock, Ipp32s ClassNum, Ipp32s QNO, const Ipp32u *QuantizeMatrix)
{
    const Ipp32u step = QuantizationSteps[QNO*4 + ClassNum];
    Ipp32u *pEntry = pDCTBlock->m_lpsDataRL;
    Ipp32s bits = 16;// DC + m0 + c1c0 + eob
    Ipp32s zeroRun = 0;
    Ipp32s entry;

    // The first list word is the DC term, which is never prequantized: skip it.
    // Bit 0x4000 marks the terminator of the list.
    for (entry = *(++pEntry); !(entry & 0x4000); entry = *(++pEntry))
    {
        zeroRun += (entry & 0x0ff);

        Ipp32s coeffIndex = DeZigzagIndex0[(entry>>8) & 0x7f];
        Ipp32u weight = QuantizeMatrix[coeffIndex];
        Ipp32s magnitude = (entry >> 16);

        //!!!!!!!!!!!!!!!!!  Important  !!!!!!!!!!!!!!!!!!!!!!
        //Quantization is very obscurely described in standard.
        //No optimization should be performed until quantization
        //is completely clarified
        //level = ((magnitude/step) * weight) >> 3;
        (void)weight; // kept for the formula above, not applied yet
        Ipp32s level = (magnitude/step);

        if (level > 255)
            return DV100_SEGMENT_LIMIT;

        if (0 == level)
        {
            // Coefficient vanishes: it simply extends the current zero run
            zeroRun++;
            continue;
        }

        if (level <= TableMaxAmpOnRun[zeroRun])
        {
            // code as single element
            bits += (EncodeTables[zeroRun][level]).length;
        }
        else
        {
            // code as 2 element: (run - 1, 0) followed by (0, amplitude)
            bits += (EncodeTables[zeroRun - 1][0x00]).length;
            bits += (EncodeTables[0x00][level]).length;
        }

        zeroRun = 0;
    }

    return bits;
}


// Quantizes the run-length list of one DCT block in place and replaces every
// surviving entry with its VLC code word.
//   pDCTBlock      - block whose list (m_lpsDataRL) is rewritten; the DC word
//                    at index 0 is left untouched
//   ClassNum, QNO  - select the step as QuantizationSteps[QNO*4 + ClassNum]
//   QuantizeMatrix - weighting matrix of the block (looked up but currently
//                    not applied, see the note inside the loop)
// Each output word holds (code << 16) | sign in bit 16 | code length in the
// low bits. When run and amplitude cannot be coded as one element, the word
// holds the code of (run 0, amplitude) and additionally the zero run in bits
// 8 and up - presumably the Huffman stage emits the run as a separate code;
// confirm against EncodeHuffman. The list is terminated with 0x4000.
void DV100SegmentCompressor::QuantizeDCTBlock(BLOCK *pDCTBlock, Ipp32s ClassNum, Ipp32s QNO, const Ipp32u *QuantizeMatrix)
{
    // Largest amplitude treated as codable; GetEncodedQuantizedDCTBlockBitLength()
    // rejects quantizers producing anything above it.
    // NOTE(review): confirm 255 against the dimensions of EncodeTables.
    const Ipp32s MaxCodedAmp = 255;

    Ipp32u QuantStep = QuantizationSteps[QNO*4 + ClassNum], elem_weight;
    Ipp32s curr_elem, run_len, amp, quantized_amp, elem_order, signX;
    Ipp32u *lpsSrc = pDCTBlock->m_lpsDataRL;
    Ipp32u *lpsDest = pDCTBlock->m_lpsDataRL + 1; // never gets ahead of lpsSrc

    // first element is never quantized
    curr_elem = *(++lpsSrc);
    run_len = 0;
    while( !(curr_elem & 0x4000) )
    {
        run_len += (curr_elem & 0x0ff);
        elem_order = DeZigzagIndex0[(curr_elem>>8) & 0x7f];
        elem_weight = QuantizeMatrix[elem_order];
        amp = (curr_elem >> 16);

        //!!!!!!!!!!!!!!!!!  Important  !!!!!!!!!!!!!!!!!!!!!!
        //Quantization is very obscurely described in standard.
        //No optimization should be performed until quantization
        //is completely clarified
        //quantized_amp = ((amp/QuantStep) * elem_weight) >> 3;
        (void)elem_weight; // kept for the formula above, not applied yet
        quantized_amp = ((amp/QuantStep) * 1);// >> 3;

        // The quantizer search may settle on the coarsest table entry without
        // the segment actually fitting, so an amplitude above the codable
        // range can still arrive here. Saturate it instead of indexing
        // EncodeTables out of range.
        if( quantized_amp > MaxCodedAmp )
            quantized_amp = MaxCodedAmp;

        if( quantized_amp != 0 )
        {
            signX = (curr_elem & 0x8000) << 1;

            if (quantized_amp <= TableMaxAmpOnRun[run_len])// code as single element
                *lpsDest++ = ((EncodeTables[run_len][quantized_amp]).code << 16) | signX | ((EncodeTables[run_len][quantized_amp]).length);
            else // code as 2 element
                *lpsDest++ = ((EncodeTables[0][quantized_amp]).code << 16) | signX | ((EncodeTables[0][quantized_amp]).length) | (run_len << 8);

            run_len = 0;
        }
        else run_len++; // coefficient vanished: it extends the current zero run

        curr_elem = *(++lpsSrc);
    }
    *lpsDest = 0x4000;
}


}//namespace UMC

#endif // (UMC_ENABLE_DVHD_VIDEO_ENCODER)
